Author: Lily Cheng. Oct. 11th, 2020
import s3fs
import pandas as pd
# Load the training recipes; the source is a JSON file stored in S3.
train_df = pd.read_json(
    path_or_buf='s3://sagemaker-studio-528576943967-ssf9zkrg3os/train.json',
)
# Notebook-style inspection: number of rows, then the frame itself.
len(train_df)
train_df
# Flatten the per-recipe ingredient lists into one row per (recipe, ingredient).
# Background: https://stackoverflow.com/questions/50217968/pandas-split-list-in-column-into-multiple-rows
# DataFrame.explode addresses the list column by name, so this does not depend
# on train_df having exactly three columns in a particular order.
new_df = (
    train_df.explode('ingredients')
    # explode yields a NaN row for an empty list; drop those so that only
    # real ingredients remain.
    .dropna(subset=['ingredients'])
    .reset_index(drop=True)
)
# Remove the cuisine column; only id and ingredients are kept.
new_new_df = new_df.drop(columns=['cuisine'])
new_new_df
# conda install -c conda-forge cufflinks-py
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Global plotting look: seaborn's "darkgrid" theme first, then matplotlib's
# 'ggplot' style sheet.
# NOTE(review): the ggplot sheet is applied last, so it presumably overrides any
# settings both define — confirm which of the two looks is actually intended.
sns.set_style("darkgrid")
plt.style.use('ggplot')
import cufflinks as cf
import plotly.express as px
import plotly.offline as py
from plotly.offline import plot
import plotly.graph_objects as go
import plotly.graph_objs as go
# try:
# import apyori
# except:
# !pip install apyori
# from apyori import apriori
# Graph: the 30 most frequent ingredients, one bar each, coloured by count.
top_raw_counts = new_new_df["ingredients"].value_counts()[:30]
raw_axis_labels = {'value': 'Count', 'index': 'Item', 'color': 'None'}
fig = px.bar(
    top_raw_counts,
    orientation="v",
    color=top_raw_counts,
    color_continuous_scale=px.colors.sequential.Plasma,
    log_x=False,
    labels=raw_axis_labels,
)
fig.update_layout(
    title_text="Item by count",
    font_color="black",
    title_font_color="red",
    legend_title_font_color="green",
)
fig.show()
There is some redundancy in the table, such as "eggs" and "large eggs", "onions" and "purple onion", "black pepper" and "ground black pepper", and so on.
# Collapse spelling / preparation variants of an ingredient onto one canonical
# name. A single dict keeps every variant next to its replacement, so the two
# sides cannot drift out of alignment and a variant cannot be listed twice.
INGREDIENT_ALIASES = {
    'large eggs': 'egg',
    'eggs': 'egg',
    'purple onion': 'onion',
    'onions': 'onion',
    'yellow onion': 'onion',
    'chopped onion': 'onion',
    'ground black pepper': 'pepper',
    'black pepper': 'pepper',
    'olive oil': 'oil',
    'extra-virgin olive oil': 'oil',
    'vegetable oil': 'oil',
    'sesame oil': 'oil',
    'cooking oil': 'oil',
    'canola oil': 'oil',
    'garlic cloves': 'garlic',
    'minced garlic': 'garlic',
    'garlic powder': 'garlic',
    'unsalted butter': 'butter',
    'kosher salt': 'salt',
    'diced tomatoes': 'tomatoes',
    'fresh ginger': 'ginger',
    'brown sugar': 'sugar',
    'white sugar': 'sugar',
    'fresh lemon juice': 'lemon juice',
    # Both lime-juice spellings end up as plain 'lime'.
    'fresh lime juice': 'lime',
    'lime juice': 'lime',
    'fresh parsley': 'parsley',
    'flat leaf parsley': 'parsley',
    'all-purpose flour': 'flour',
    'grated parmesan cheese': 'parmesan cheese',
    'chopped cilantro fresh': 'cilantro',
    'cilantro leaves': 'cilantro',
    'ground cinnamon': 'cinnamon',
    'boneless skinless chicken breasts': 'chicken breasts',
    'heavy cream': 'cream',
    'fresh basil': 'basil',
    'jalapeno chilies': 'jalapeno',
}
# Work on a copy so the un-normalised frame stays available.
df_no_redundancy = new_new_df.copy()
df_no_redundancy['ingredients'] = new_new_df['ingredients'].replace(INGREDIENT_ALIASES)
Salt, water, and sugar are not really "food items" and should be ignored.
# Remove the rows whose ingredient is one of the ignored staple items.
ignored_items = ['salt', 'sugar', 'water', 'cooking spray', 'baking powder']
is_ignored = df_no_redundancy['ingredients'].isin(ignored_items)
df_no_season = df_no_redundancy[~is_ignored]
# Graph: the 30 most frequent ingredients after normalisation and filtering,
# one bar each, coloured by count.
top_filtered_counts = df_no_season["ingredients"].value_counts()[:30]
filtered_axis_labels = {'value': 'Count', 'index': 'Item', 'color': 'None'}
fig = px.bar(
    top_filtered_counts,
    orientation="v",
    color=top_filtered_counts,
    color_continuous_scale=px.colors.sequential.Plasma,
    log_x=False,
    labels=filtered_axis_labels,
)
fig.update_layout(
    title_text="Item by count",
    font_color="black",
    title_font_color="red",
    legend_title_font_color="green",
)
fig.show()
# !pip install mlxtend
# !pip install squarify
from mlxtend.preprocessing import TransactionEncoder
import matplotlib
import squarify
# Build the basket data: one list of ingredients per recipe id.
basket = df_no_season
basket.head()
transactions = [
    recipe['ingredients'].tolist() for _, recipe in basket.groupby(['id'])
]

# One-hot encode the baskets: one row per recipe, one boolean column per item.
te = TransactionEncoder()
te_ary = te.fit(transactions).transform(transactions)
transactions = pd.DataFrame(te_ary, columns=te.columns_)

# Number of recipes containing each item. Summing a boolean column counts its
# True cells directly. A describe()-based "count - freq" only gives the same
# number while False is the most frequent value, i.e. it is wrong for any item
# that appears in more than half of the recipes.
item_counts = transactions.sum()
item = pd.DataFrame({'Count': item_counts.values, 'Item': item_counts.index})
# Keep the 50 most frequent items for the treemap.
item = item.sort_values(['Count'], ascending=False).head(50)
transactions
# Treemap of the items in `item`: tile area and tile colour both encode Count.
item_counts_for_plot = item["Count"]
mini = min(item_counts_for_plot)
maxi = max(item_counts_for_plot)
# Map the count range onto the coolwarm colour map.
cmap = matplotlib.cm.coolwarm
norm = matplotlib.colors.Normalize(vmin=mini, vmax=maxi)
colors = [cmap(norm(count)) for count in item_counts_for_plot]

fig, ax = plt.subplots(figsize=(18, 10))
squarify.plot(
    sizes=item_counts_for_plot,
    label=item["Item"],
    alpha=0.8,
    color=colors,
)
plt.axis('off')
plt.title("Top 50 Frequent Recipe Items", fontsize=32)
# Move the title slightly above the axes.
ttl = ax.title
ttl.set_position([.5, 1.05])
from mlxtend.frequent_patterns import apriori, association_rules
# Mine the frequent itemsets (up to 5 items each) from the one-hot basket table.
# Alternative threshold that was tried: min_support=0.015
frequent_itemsets = apriori(
    transactions,
    min_support=0.009,
    use_colnames=True,
    max_len=5,
)
# Both names refer to the same frame.
frequent_itemset = frequent_itemsets
# Record how many items each itemset contains.
frequent_itemsets['length'] = frequent_itemsets['itemsets'].map(len)
frequent_itemsets  # All frequent itemsets found, the input for rule generation
from functools import lru_cache

# Derive association rules from the frequent itemsets, keeping every rule whose
# lift reaches the (very low) threshold.
b = association_rules(frequent_itemsets, metric="lift", min_threshold=0.001)

# Rebuild the raw baskets: one list of ingredients per recipe id.
transactions = [
    recipe['ingredients'].tolist() for _, recipe in basket.groupby(['id'])
]
# Each basket as a frozenset, built once, so that every count below is a plain
# subset test instead of re-creating a set per basket on every call.
_transaction_sets = [frozenset(t) for t in transactions]


def trans():
    """Yield every basket (a list of ingredients), one at a time."""
    yield from transactions


@lru_cache(maxsize=None)
def _count_containing(items):
    """Return how many baskets contain every element of the frozenset *items*.

    Cached because the same itemset shows up in many different rules.
    """
    return sum(1 for basket_items in _transaction_sets if items <= basket_items)


def ant(x):
    """Return the number of baskets that contain all items of the set *x*."""
    return _count_containing(frozenset(x))


# Absolute basket counts per rule, addressed by column name so the code does not
# depend on how many metric columns association_rules emits:
#   uni - baskets containing antecedents and consequents together
#   ant - baskets containing the antecedents
#   con - baskets containing the consequents
#   tot - total number of baskets (derived from the data, not hard-coded)
b['uni'] = [
    ant(lhs | rhs) for lhs, rhs in zip(b['antecedents'], b['consequents'])
]
b['ant'] = [ant(lhs) for lhs in b['antecedents']]
b['con'] = [ant(rhs) for rhs in b['consequents']]
b['tot'] = len(transactions)

# Row-wise list form of the rule table, kept for any later positional use.
bb = b.values.tolist()

# One plain dict per rule, in the key layout consumed by generate_rule_from_dict.
rules_dict = [
    {
        'lhs': tuple(lhs),
        'rhs': tuple(rhs),
        'count_full': int(count_full),
        'count_lhs': int(count_lhs),
        'count_rhs': int(count_rhs),
        'num_transactions': int(total),
    }
    for lhs, rhs, count_full, count_lhs, count_rhs, total in zip(
        b['antecedents'], b['consequents'],
        b['uni'], b['ant'], b['con'], b['tot'],
    )
]
# pip install --index-url https://test.pypi.org/simple/ PyARMViz
from PyARMViz import PyARMViz
from PyARMViz.Rule import generate_rule_from_dict
# Convert each rule record with generate_rule_from_dict, then draw the three
# PyARMViz views of the rule set.
rules = [generate_rule_from_dict(rule_record) for rule_record in rules_dict]
PyARMViz.generate_parallel_category_plot(rules)
PyARMViz.generate_rule_graph_plotly(rules)
PyARMViz.generate_rule_strength_plot(rules)